@inproceedings{nigatu-etal-2025-case,
title = "A Case Against Implicit Standards: Homophone Normalization in Machine Translation for Languages that use the {G}e{'}ez Script.",
author = "Nigatu, Hellina Hailu and
Tonja, Atnafu Lambebo and
Ademtew, Henok Biadglign and
Alemayehu, Hizkiel Mitiku and
Abadi, Negasi Haile and
Belay, Tadesse Destaw and
Yimam, Seid Muhie",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.523/",
doi = "10.18653/v1/2025.emnlp-main.523",
pages = "10309--10320",
ISBN = "979-8-89176-332-6",
abstract = "Homophone normalization{--}where characters that have the same sound in a writing script are mapped to one character{--}is a pre-processing step applied in Amharic Natural Language Processing (NLP) literature. While this may improve performance reported by automatic metrics, it also results in models that are unable to effectively process different forms of writing in a single language. Further, there might be impacts in transfer learning, where models trained on normalized data do not generalize well to other languages. In this paper, we experiment with monolingual training and cross-lingual transfer to understand the impacts of normalization on languages that use the Ge{'}ez script. We then propose a post-inference intervention in which normalization is applied to model predictions instead of training data. With our simple scheme of post-inference normalization, we show that we can achieve an increase in BLEU score of up to 1.03 while preserving language features in training."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="nigatu-etal-2025-case">
<titleInfo>
<title>A Case Against Implicit Standards: Homophone Normalization in Machine Translation for Languages that use the Ge’ez Script.</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hellina</namePart>
<namePart type="given">Hailu</namePart>
<namePart type="family">Nigatu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atnafu</namePart>
<namePart type="given">Lambebo</namePart>
<namePart type="family">Tonja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Henok</namePart>
<namePart type="given">Biadglign</namePart>
<namePart type="family">Ademtew</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hizkiel</namePart>
<namePart type="given">Mitiku</namePart>
<namePart type="family">Alemayehu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Negasi</namePart>
<namePart type="given">Haile</namePart>
<namePart type="family">Abadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tadesse</namePart>
<namePart type="given">Destaw</namePart>
<namePart type="family">Belay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seid</namePart>
<namePart type="given">Muhie</namePart>
<namePart type="family">Yimam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Homophone normalization–where characters that have the same sound in a writing script are mapped to one character–is a pre-processing step applied in Amharic Natural Language Processing (NLP) literature. While this may improve performance reported by automatic metrics, it also results in models that are unable to effectively process different forms of writing in a single language. Further, there might be impacts in transfer learning, where models trained on normalized data do not generalize well to other languages. In this paper, we experiment with monolingual training and cross-lingual transfer to understand the impacts of normalization on languages that use the Ge’ez script. We then propose a post-inference intervention in which normalization is applied to model predictions instead of training data. With our simple scheme of post-inference normalization, we show that we can achieve an increase in BLEU score of up to 1.03 while preserving language features in training.</abstract>
<identifier type="citekey">nigatu-etal-2025-case</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.523</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.523/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>10309</start>
<end>10320</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Case Against Implicit Standards: Homophone Normalization in Machine Translation for Languages that use the Ge’ez Script.
%A Nigatu, Hellina Hailu
%A Tonja, Atnafu Lambebo
%A Ademtew, Henok Biadglign
%A Alemayehu, Hizkiel Mitiku
%A Abadi, Negasi Haile
%A Belay, Tadesse Destaw
%A Yimam, Seid Muhie
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F nigatu-etal-2025-case
%X Homophone normalization–where characters that have the same sound in a writing script are mapped to one character–is a pre-processing step applied in Amharic Natural Language Processing (NLP) literature. While this may improve performance reported by automatic metrics, it also results in models that are unable to effectively process different forms of writing in a single language. Further, there might be impacts in transfer learning, where models trained on normalized data do not generalize well to other languages. In this paper, we experiment with monolingual training and cross-lingual transfer to understand the impacts of normalization on languages that use the Ge’ez script. We then propose a post-inference intervention in which normalization is applied to model predictions instead of training data. With our simple scheme of post-inference normalization, we show that we can achieve an increase in BLEU score of up to 1.03 while preserving language features in training.
%R 10.18653/v1/2025.emnlp-main.523
%U https://aclanthology.org/2025.emnlp-main.523/
%U https://doi.org/10.18653/v1/2025.emnlp-main.523
%P 10309-10320
Markdown (Informal)
[A Case Against Implicit Standards: Homophone Normalization in Machine Translation for Languages that use the Ge’ez Script.](https://aclanthology.org/2025.emnlp-main.523/) (Nigatu et al., EMNLP 2025)
ACL